source('https://raw.githubusercontent.com/fraba/R_cheatsheet/master/database.R')
db_store <- 'm5s_fb_public_figures_mar2015.sqlite'
db_target <- 'm5s_fb_target_pages.sqlite'

# Geolocate
require(readr)
# DOWNLOAD from http://download.geonames.org/export/dump/
geonames <- read_delim("IT.txt", "\t", # NOT provided
                       escape_double = FALSE, col_names = FALSE, 
                       trim_ws = TRUE)

geonames <- subset(geonames, X7 %in% c('P','A'))
geonames <- subset(geonames, !grepl("H$",X8))

require(tm)
stopwords_it <- stopwords(kind='it')
pasteIntoRegex <- function(x2, x4) {
  vec <- tolower(x2)
  if (!is.na(x4)) {
    splt <- tolower(strsplit(x4,",")[[1]])
    is_invalid <- grepl('[^[:ascii:]àèéìòùáíóú]', splt, perl=TRUE, ignore.case=TRUE)
    splt <- splt[!is_invalid]
    vec <- c(vec, splt[splt != vec]) 
  }
  
  vals <- c("\\\\", "\\[", "\\]", "\\(", "\\)", 
            "\\{", "\\}", "\\^", "\\$","\\*", 
            "\\+", "\\?", "\\.", "\\|")
  replace.vals <- paste0("\\\\", vals)
  for(i in seq_along(vals)){
    vec <- gsub(vals[i], replace.vals[i], vec)
  }
  
  # Remove stop words
  vec <- vec[!(vec %in% stopwords_it)]
  
  vec <- sapply(vec, function(x) paste0("\\b",x,"\\b"))
  return(paste(vec, collapse = "|"))
}

geonames$regex <- mapply(pasteIntoRegex, geonames$X2, geonames$X4)
geonames <- subset(geonames, regex != "")

page_details <- sqliteGetTable(db_target, 'target')

page_details$gn <- NA


# check if is_person 
library(readr)
italian_names_by_gender <- read_csv("italian_names_by_gender.csv", 
                                      col_names = FALSE)

containsPersonName <- function(str) {
  require(stringr)
  splt <- unlist(str_split(str, ' '))
  return(any(splt %in% italian_names_by_gender$X1))
}

page_details$is_person <- sapply(page_details$name.x, containsPersonName)
names(page_details)[2] <- 'id'

getGNId <- function(page_details, p, var) {
  
  # Loop 1 (grepl)
  results <- list()
  for (id in subset(page_details, is_person == p & is.na(gn))$id) {
    print(id)
    results[[id]] <- 
      which(sapply(geonames$regex, function(x) grepl(x, page_details[[var]][page_details$id == id], ignore.case = T)))
  }
  
  # Loop 2 (distance)
  require(stringr)
  require(stringdist)
  for (id in names(results)) {
    print(id)
    if(length(results[[id]]) == 0) {
      next
    } else if (length(results[[id]])>1) {
      geonames_names <- geonames$X2[results[[id]]]
      
      # Closest
      geo_entities <- 
        sapply(names(results[[id]]), 
               function(x) str_extract(tolower(page_details[[var]][page_details$id == id]), x))
      max_nchar <- max(nchar(geo_entities))
      geo_entities <- geo_entities[which(nchar(geo_entities) == max_nchar)]
      
      dist <- stringdistmatrix(geo_entities, geonames_names)
      min_dist <- min(dist)
      
      geonames_names <- geonames_names[which(apply(dist, 2, function(x) min_dist %in% x))]
      
      if (length(geonames_names) == 1) {
        page_details$gn[page_details$id == id] <- geonames$X1[geonames$X2 == geonames_names]
      } else {
        # Or lower administrative level 
        geonames_pop <- geonames$X15[geonames$X2 %in% geonames_names]
        page_details$gn[page_details$id == id] <- 
          geonames$X1[geonames$X2 %in% geonames_names & geonames$X15 == min(geonames_pop, na.rm = T)]
      }
    } else {
      page_details$gn[page_details$id == id] <- geonames$X1[results[[id]]]
    }
  }
  return(page_details)
}

page_details <- getGNId(page_details, p = FALSE, var = 'name.x')

page_details <- merge(page_details, geonames[,c('X1','X5','X6','X7','X8')], by.x = 'gn', by.y = 'X1', all.x = TRUE)
names(page_details)[15:18] <- c('lat','lon', 'gn_feature_class', 'gn_feature_code')

save(page_details, file = '02_04_m5s_meetup_vote_analysis_fb_page_details_wt_geo.RData')

# library(ggplot2)
# library(ggmap)
# map <- get_map(location='italy', source="stamen", maptype="watercolor", crop=FALSE, zoom = 6)
# 
# ggmap(map) +
#   geom_point(data = page_details[!is.na(page_details$gn) & page_details$gn_feature_code !='ADM1', ], aes(x = lon, y = lat, fill = "red", alpha = 0.8), size = 0.2)


details_of_geoed_orgs <- subset(page_details, !is.na(gn) & gn_feature_code != "ADM1")

# Add province
require(sp)
require(rgdal)
load("02_03_m5s_meetup_vote_spatial_data.RData")

wgs_crs <- CRS('+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0')

utmStr <- "+proj=utm +zone=%d +datum=NAD83 +units=m +no_defs +ellps=GRS80 +towgs84=0,0,0"
utm_crs <- CRS(sprintf(utmStr, 32))

xy <- cbind(details_of_geoed_orgs$lon, details_of_geoed_orgs$lat)
sp_points <-  SpatialPoints(xy, proj4string = wgs_crs)

## Transform
italy_map_2010 <- spTransform(italy_map_2010, utm_crs)
italy_map_2013 <- spTransform(italy_map_2013, utm_crs)
italy_map_2014 <- spTransform(italy_map_2014, utm_crs)
sp_points <- spTransform(sp_points, utm_crs)

res10 <- over(sp_points, italy_map_2010)
res13 <- over(sp_points, italy_map_2013)
res14 <- over(sp_points, italy_map_2014)

details_of_geoed_orgs$COD_PRO_2010 <- res10$COD_PRO
details_of_geoed_orgs$COD_PRO_2013 <- res13$COD_PRO
details_of_geoed_orgs$COD_PRO_2014 <- res14$COD_PRO
details_of_geoed_orgs$PROVINCIA_2010 <- res10$PROVINCIA
details_of_geoed_orgs$PROVINCIA_2013 <- res13$PROVINCIA
details_of_geoed_orgs$PROVINCIA_2014 <- res14$PROVINCIA

# Add n_liking_users, n_commenting_users
post <- sqliteGetTable(db_store, 'post')
names(post)[1] <- 'post_id' 

post <- merge(post, details_of_geoed_orgs[,c('id',"COD_PRO_2010","COD_PRO_2013",
                                             "COD_PRO_2014","PROVINCIA_2010","PROVINCIA_2013", "PROVINCIA_2014")], 
              by.x = 'page_id', by.y = 'id')

comment <- sqliteGetTable(db_store, 'comment')
post_likes <- sqliteGetTable(db_store, 'post_likes')

require(data.table)
post <- data.table(post)
comment <- data.table(comment)
post_likes <- data.table(post_likes)

setkey(post, 'post_id')
setkey(comment, 'post_id')
setkey(post_likes, 'post_id')

comment <- merge(comment, post[,.(post_id, page_id, 
                                  COD_PRO_2010, COD_PRO_2013, COD_PRO_2014, 
                                  PROVINCIA_2010, PROVINCIA_2013, PROVINCIA_2014)],
                 all.x = FALSE)
post_likes <- merge(post_likes, post[,.(post_id, page_id, created_time,
                                        COD_PRO_2010, COD_PRO_2013, COD_PRO_2014, 
                                        PROVINCIA_2010, PROVINCIA_2013, PROVINCIA_2014)],
                    all.x = FALSE)

# comment <- subset(comment, page_id %in% details_of_geoed_orgs$id)
# post_likes <- subset(post_likes, page_id %in% details_of_geoed_orgs$id)

comment$date <- as.Date(comment$created_time)
post_likes$date <-  as.Date(post_likes$created_time)

electionday <- list()
electionday[['2010']] <- as.Date('2010-03-28')
electionday[['2013']] <- as.Date('2013-02-24')
electionday[['2014']] <- as.Date('2014-05-25')

# Windows
days <- c(90, 60, 30, 7)

count <- list()

require(dplyr)
fb_count <- list()
fb_count[['2010']] <- data.frame(
  COD_PRO = unique(as.character(details_of_geoed_orgs$COD_PRO_2010)), 
  stringsAsFactors = F)
fb_count[['2013']] <- data.frame(
  COD_PRO = unique(as.character(details_of_geoed_orgs$COD_PRO_2013)), 
  stringsAsFactors = F)
fb_count[['2014']] <- data.frame(
  COD_PRO = unique(as.character(details_of_geoed_orgs$COD_PRO_2014)), 
  stringsAsFactors = F)

recurrentUser <- function(vec) {
  vec <- vec[duplicated(vec)]
  return(length(unique(vec)))
}

for (d in days) {
  for (e in electionday) {
    
    year <- gsub("-\\d{2}-\\d{2}", "", e)
    var_prefix <- paste0(year, "_", d, "days_")
    
    # Comments
    tmp_df <-
      subset(comment, date >= (e - d) & date <= e) %>%
      dplyr::group_by_(paste0("COD_PRO_", year)) %>%
      dplyr::summarize(comments = n(),
                       commenting_users = length(unique(from)),
                       commenting_2plus_users = recurrentUser(from))
    tmp_df[[1]] <- as.character(tmp_df[[1]])
    names(tmp_df)[2:4] <- 
      paste0(var_prefix, names(tmp_df)[2:4])
    fb_count[[year]] <- 
      merge(fb_count[[year]],  tmp_df, by.x = 'COD_PRO', by.y = paste0("COD_PRO_", year), all.x = TRUE)
    # Likes
    tmp_df <-
      subset(post_likes, date >= (e - d) & date <= e) %>%
      dplyr::group_by_(paste0("COD_PRO_", year)) %>%
      dplyr::summarize(likes = n(),
                       liking_users = length(unique(profile_id)),
                       liking_2plus_users = recurrentUser(profile_id))
    tmp_df[[1]] <- as.character(tmp_df[[1]])
    names(tmp_df)[2:4] <- paste0(var_prefix, names(tmp_df)[2:4])
    fb_count[[year]] <- 
      merge(fb_count[[year]],  tmp_df, by.x = 'COD_PRO', by.y = paste0("COD_PRO_", year), all.x = TRUE)
  }
}

save(fb_count, file = "02_04_m5s_meetup_vote_analysis_fb_count_by_province_mar15.RData")

# Control for weighted distance distance
load("02_04_m5s_meetup_vote_analysis_fb_node_distance.RData")

cutoff <- mean(df_distance$weighted_mean_dist, na.rm = T)
df_distance$exclude <- df_distance$weighted_mean_dist > cutoff
df_distance$exclude[is.na(df_distance$exclude)] <- FALSE
to_exclude <- as.character(df_distance[df_distance$exclude == TRUE ,]$page_id)
to_exclude <- gsub("^p","",to_exclude)

# targets$to_exclude_for_distance <- targets$id %in% to_exclude
# 
# require(ggplot2)
# ggplot(targets, aes(x=to_exclude_for_distance, y=likes)) +
#   geom_boxplot() +
#   scale_y_log10()

require(dplyr)
fb_count_dist <- list()
fb_count_dist[['2013']] <- data.frame(
  COD_PRO = unique(as.character(targets$COD_PRO_2013)), 
  stringsAsFactors = F)

for (d in days) {
  for (e in electionday) {
    
    year <- gsub("-\\d{2}-\\d{2}", "", e)
    var_prefix <- paste0("days_",d)
    
    # Posts
    tmp_df <-
      subset(post, date >= (e - d) & date <= e & !(page_id %in% to_exclude)) %>%
      dplyr::group_by_(paste0("COD_PRO_", year)) %>%
      dplyr::summarize(posts = n())
    tmp_df[[1]] <- as.character(tmp_df[[1]])
    names(tmp_df)[2] <- 
      paste0(var_prefix, names(tmp_df)[2])
    fb_count_dist[[year]] <- 
      merge(fb_count_dist[[year]],  tmp_df, by.x = 'COD_PRO', by.y = paste0("COD_PRO_", year), all.x = TRUE)
    
    
    # Comments
    tmp_df <-
      subset(comment, date >= (e - d) & date <= e & !(page_id %in% to_exclude)) %>%
      dplyr::group_by_(paste0("COD_PRO_", year)) %>%
      dplyr::summarize(comments = n(),
                       commenting_users = length(unique(from)),
                       commenting_2plus_users = recurrentUser(from))
    tmp_df[[1]] <- as.character(tmp_df[[1]])
    names(tmp_df)[2:4] <- 
      paste0(var_prefix, names(tmp_df)[2:4])
    fb_count_dist[[year]] <- 
      merge(fb_count_dist[[year]],  tmp_df, by.x = 'COD_PRO', by.y = paste0("COD_PRO_", year), all.x = TRUE)
    # Likes
    tmp_df <-
      subset(post_likes, date >= (e - d) & date <= e & !(page_id %in% to_exclude)) %>%
      dplyr::group_by_(paste0("COD_PRO_", year)) %>%
      dplyr::summarize(likes = n(),
                       liking_users = length(unique(profile_id)),
                       liking_2plus_users = recurrentUser(profile_id))
    tmp_df[[1]] <- as.character(tmp_df[[1]])
    names(tmp_df)[2:4] <- paste0(var_prefix, names(tmp_df)[2:4])
    fb_count_dist[[year]] <- 
      merge(fb_count_dist[[year]],  tmp_df, by.x = 'COD_PRO', by.y = paste0("COD_PRO_", year), all.x = TRUE)
  }
}
save(fb_count_dist, file = "02_04_m5s_meetup_vote_analysis_fb_count_dist_by_province.RData")

